%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.api import qqplot
import plotly
plotly.offline.init_notebook_mode()
import plotly.offline as po
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf
import plotly.figure_factory as ff
from datetime import datetime, timedelta
# Tickers and macro/rate series used throughout the analysis.
symbols = ['ITUB4', 'ABEV3', 'BBDC4', 'PETR4', 'VALE5']
taxas = ['CDI', 'ipca']

# --- price data frame: one '<ticker>_price' column per stock ---
dados = pd.DataFrame()
for ticker in symbols:
    prices = pd.read_excel('./dados/' + ticker + '_price.xlsx', index_col=[0], parse_dates=[0])
    prices.columns = [ticker + '_price']
    prices.index.name = 'Dates'
    dados = pd.concat([dados, prices], join='outer', axis=1)

# --- IPCA (inflation) and risk-free (CDI) series ---
ipca = pd.read_excel('./dados/ipca.xlsx', index_col=[0], parse_dates=[0])
ipca.index.name = 'Dates'
CDI = pd.read_excel('./dados/CDI.xlsx', index_col=[0], parse_dates=[0])
CDI.index.name = 'Dates'

# Consolidated frame: prices + ipca + risk-free, outer-joined on dates.
dados = pd.concat([dados, ipca, CDI], join='outer', axis=1)
# --- dividends frame: one '<ticker>_div' column per stock, outer-joined on dates ---
dividends = pd.DataFrame()
for stock in symbols:
    df_d = pd.read_excel('./dados/' + stock + '_dividend.xlsx', index_col=[0], parse_dates=[0])
    df_d.columns = [stock + '_div']
    df_d.index.name = 'Dates'
    dividends = pd.concat([dividends, df_d], join='outer', axis=1)

# FIX: DataFrame.convert_objects was deprecated in pandas 0.17 and removed in
# 0.23; pd.to_numeric is the supported replacement.  errors='coerce' keeps the
# old convert_numeric=True behavior (unparseable cells become NaN).
dados = dados.apply(pd.to_numeric, errors='coerce')
dados.dtypes
# --- resample everything to business-month-end ('BM') frequency ---
# IPCA is a flow (monthly inflation), so it is summed within each period;
# prices and the CDI rate are levels, so the period's last observation is kept.
ipca_resamp = dados['ipca'].resample('BM').sum()
dados = dados.drop('ipca', axis=1).resample('BM').last()
dados = pd.concat([dados, ipca_resamp], axis=1, join='outer')

# Dividends paid within each business month are accumulated.
dividends = dividends.resample('BM').sum()
# --- excess returns: simple returns minus the CDI risk-free return ---
Returns = dados.pct_change(1)
xs_ret = pd.DataFrame()
for col in Returns.columns:
    xs_ret[col.replace('_price', '') + '_xs_ret'] = Returns[col] - Returns['CDI']
# CDI and ipca are not assets; their "excess return" columns are meaningless.
xs_ret = xs_ret.drop(['CDI_xs_ret', 'ipca_xs_ret'], axis=1)

cf.go_offline()
cf.set_config_file(offline=True, world_readable=False, theme='ggplot')
xs_ret.dropna().iplot(title='Excess of Return')
xs_ret.dropna().iplot(kind='box', title='Excess of Returns boxplot')
# --- trailing 12-month dividend sum per ticker ---
div_12m = pd.DataFrame()
for col in dividends.columns:
    rolled = dividends[col].fillna(0).rolling(window=12).sum()
    div_12m[col + '_12sum'] = np.around(rolled, decimals=5)

price_cols = [c for c in dados.columns if 'price' in c]

# Dividend-price ratio: trailing 12m dividends over the current price.
dp = pd.DataFrame()
for col in price_cols:
    base = col.replace('price', '')
    dp[base + 'dp'] = div_12m[base + 'div_12sum'] / dados[col]
dp.dropna().iplot(title='Dividend-Price ratio')

# Dividend yield: trailing 12m dividends over the previous period's price.
dy = pd.DataFrame()
for col in price_cols:
    base = col.replace('price', '')
    dy[base + 'dy'] = div_12m[base + 'div_12sum'] / dados[col].shift(1)
dy.dropna().iplot(title='Dividend-Yield')
# --- realized variance (SVAR): squared daily returns summed per business month ---
daily = pd.DataFrame()
for stock in symbols:
    df_p = pd.read_excel('./dados/' + stock + '_price.xlsx', index_col=[0], parse_dates=[0])
    df_p.columns = [stock + '_svar']
    df_p.index.name = 'Dates'
    daily = pd.concat([daily, df_p], join='outer', axis=1)

# FIX: convert_objects was removed from pandas; pd.to_numeric with
# errors='coerce' reproduces convert_numeric=True (bad cells -> NaN).
daily = daily.apply(pd.to_numeric, errors='coerce')
svar = (daily.pct_change(1) ** 2).resample('BM').sum()
svar.iplot(title='SVAR')

Returns['CDI'].dropna().iplot(title='CDI Returns')
dados.loc['2000-01-01':, 'ipca'].iplot(title='Monthly IPCA')
# Inner-join predictors and targets on their common dates; map +/-inf
# (e.g. from divisions by zero prices) to NaN so they can be handled uniformly.
all_common_data = pd.concat([xs_ret, dp, dy, svar, Returns['CDI'], dados['ipca']],
                            axis=1, join='inner')
all_common_data = all_common_data.replace([np.inf, -np.inf], np.nan)
# Arbitrariamente, vamos escolher o período out-of-sample como sendo de 2011.01 até 2017.06
# --- expanding (historical) mean and variance of each excess-return series ---
for column in all_common_data.columns:
    if '_xs' in column:
        expanding = all_common_data[column].expanding()
        all_common_data[column + 'hist_mean'] = np.around(expanding.mean(), decimals=5)
        all_common_data[column + 'hist_var'] = np.around(expanding.var(), decimals=5)

# Constant column for the OLS regressions below; remaining NaNs (early
# expanding windows, missing data) are zero-filled.
all_common_data = sm.add_constant(all_common_data).fillna(0)

# First date of the out-of-sample window (ISO string for label slicing).
cutoff_period = datetime(2011, 1, 31).isoformat()
cutoff_period
# Number of out-of-sample periods minus one.
q = all_common_data.loc[cutoff_period:, :].shape[0] - 1
q
# One forecast column per (predictor, ticker) pair.  Ticker-specific
# predictors ('_dp', '_dy', '_svar') already carry a leading underscore; the
# common series ('CDI', 'ipca') need one inserted between ticker and name.
regressor_list = ['_dp', '_dy', '_svar', 'CDI', 'ipca']
columns_r_hat = [
    sym + reg + '_hat' if '_' in reg else sym + '_' + reg + '_hat'
    for reg in regressor_list
    for sym in symbols
]
r_hat = pd.DataFrame(index=all_common_data.index, columns=columns_r_hat)
# --- expanding-window univariate OLS forecasts ---
# For each out-of-sample date i: fit each ticker's excess return on a constant
# plus one predictor using data up to and including i, then predict every date
# strictly after i.  Each later iteration overwrites all but its first step,
# so r_hat at date t ends up holding the one-step-ahead forecast from the
# model estimated on the full history before t.
one_day = pd.Timedelta(days=1)
for i in all_common_data.loc[cutoff_period:, :].index.tolist():
    for tgt in symbols:
        for reg in regressor_list:
            # Same naming convention as columns_r_hat.
            if '_' in reg:
                x_col, hat_col = tgt + reg, tgt + reg + '_hat'
            else:
                x_col, hat_col = reg, tgt + '_' + reg + '_hat'
            model = sm.OLS(all_common_data.loc[:i, tgt + '_xs_ret'],
                           all_common_data.loc[:i, ['const', x_col]])
            res = model.fit()
            # FIX: the original sliced with `i + 1`, which raises TypeError on
            # a DatetimeIndex (Timestamp + int).  A one-day offset gives the
            # identical "strictly after i" label slice on month-end dates.
            r_hat.loc[i + one_day:, hat_col] = res.predict(
                all_common_data.loc[i + one_day:, ['const', x_col]])
# Combination forecasts: cross-model mean and median per ticker.
combined = pd.DataFrame(index=r_hat.index)
for sym in symbols:
    model_cols = [c for c in r_hat.columns if sym in c]
    combined[sym + '_mean_hat'] = r_hat[model_cols].mean(axis=1)
    combined[sym + '_median_hat'] = r_hat[model_cols].median(axis=1)
all_common_data = pd.concat([all_common_data, r_hat, combined], axis=1, join='outer')
#theta = pd.DataFrame()
#
#for sym in symbols:
# for model in regressor_list:
# if '_' in model:
# theta[sym+model+'_hat'] = (all_common_data[sym + '_xs_ret'] - all_common_data[sym+model+'_hat']) ** 2
# else:
# theta[sym+'_'+model+'_hat'] = (all_common_data[sym + '_xs_ret'] - all_common_data[sym+'_'+model+'_hat']) ** 2
#theta = theta.dropna()
#theta = theta.expanding().sum()
#for sym in symbols:
# theta[sym + '_sum'] = theta[[col for col in theta.columns if sym in col]].sum(axis = 1)
#for col in theta.columns:
# theta[col + '_theta09'] = np.zeros(theta.shape[0])
#
#for col in theta.columns:
# if '_theta09' not in col:
# for row in range(0, q):
# theta.ix[row, col] = theta.ix[row, col] ** (0.9 ** (row + 1))
# --- out-of-sample R^2 ingredients and Clark-West adjusted-MSPE series ---
r_os = pd.DataFrame()
for sym in symbols:
    for col in [c for c in all_common_data.columns if 'hat' in c]:
        if sym in col:
            window = slice('2011-02-28', None)
            actual = all_common_data.loc[window, sym + '_xs_ret']
            bench = all_common_data.loc[window, sym + '_xs_rethist_mean']
            forecast = all_common_data.loc[window, col]
            # Squared errors of the model forecast (numerator) and of the
            # historical-mean benchmark (denominator) of the OOS R^2.
            r_os[col + '_num'] = (actual - forecast) ** 2
            r_os[col + '_den'] = (actual - bench) ** 2
            # Clark-West MSPE-adjusted loss differential.
            r_os[col + '_adj-mspe'] = ((actual - bench) ** 2
                                       - ((actual - forecast) ** 2
                                          - (bench - forecast) ** 2))

# Constant-only design matrix for the significance tests below.
const = sm.add_constant(pd.DataFrame(index=r_os.index))
# --- Clark-West test: regress each adj-MSPE series on a constant and keep
# the intercept's p-value ---
test = pd.DataFrame(index=['0'])
for sym in symbols:
    for col in [c for c in r_os.columns if 'mspe' in c]:
        if sym in col:
            endog = np.array(r_os[col].dropna().values, dtype='float')
            exog = np.array(const.iloc[:, 0].values, dtype='float')
            test[col] = sm.OLS(endog, exog).fit().pvalues[0]

# Out-of-sample R^2 in percent: 1 - sum(model sq. errors)/sum(benchmark sq. errors).
num_sums = r_os[[c for c in r_os.columns if 'num' in c]].sum()
den_sums = r_os[[c for c in r_os.columns if 'den' in c]].sum()
aux = pd.DataFrame(round((1 - (num_sums / den_sums.values)) * 100, 2))
# --- summary table: OOS R^2 (%) with the Clark-West p-value in parentheses ---
results = pd.DataFrame(index=['dp', 'dy', 'svar', 'CDI', 'ipca', 'mean', 'median'],
                       columns=['ITUB4', 'PETR4', 'VALE5', 'BBDC4', 'ABEV3'])
for sym in symbols:
    for row in results.index:
        for j in aux.index:
            if row in j and sym in j:
                pval = test.loc['0', j.replace('_num', '') + '_adj-mspe']
                results.loc[row, sym] = str(aux.loc[j, 0]) + ' (' + str(round(pval, 2)) + ')'
po.iplot(ff.create_table(results, index=True))
# Per-ticker plot of all related columns (actuals, predictors, forecasts).
# The filter substrings mirror the original code exactly: 'ITUB' and 'PETR'
# are truncated, the other tickers are matched in full.
for key, title in [('ITUB', 'ITUB4'), ('PETR', 'PETR4'), ('VALE5', 'VALE5'),
                   ('ABEV3', 'ABEV3'), ('BBDC4', 'BBDC4')]:
    all_common_data[[c for c in all_common_data.columns if key in c]].iplot(title=title)
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNetCV
import sklearn.tree
import pydotplus


def _train_test_split_for(key):
    """Build train/test arrays for one ticker.

    key -- substring selecting the ticker's columns in all_common_data
           (e.g. 'ITUB' or 'VALE').

    Features: the ticker's raw predictor columns (forecast '_hat' and
    expanding 'hist' columns excluded) plus the common CDI and ipca series.
    Target: the ticker's excess-return ('xs') column.
    Train = everything up to cutoff_period; test = the OOS window.
    """
    X = all_common_data[[c for c in all_common_data.columns if key in c]]
    X = X[[c for c in X.columns if 'hat' not in c]]
    X = X[[c for c in X.columns if 'hist' not in c]]
    y = X[[c for c in X.columns if 'xs' in c]]
    X = X[[c for c in X.columns if 'xs' not in c]]
    X = pd.concat([X, all_common_data['CDI'], all_common_data['ipca']], axis=1, join='outer')
    # NOTE(review): dropna() is applied to X_train but not y_train; if the
    # training features ever contained NaNs the arrays would misalign.  With
    # the current pipeline (fillna(0) upstream) nothing is actually dropped,
    # so behavior matches the original copy-pasted cells -- confirm on new data.
    X_train = X.loc[:cutoff_period, :].dropna().values
    X_test = X.loc['2011-02-28':, :].values
    y_train = y.loc[:cutoff_period, :].values
    y_test = y.loc['2011-02-28':, :].values
    return X_train, X_test, y_train, y_test


def _fit_and_plot(model, key, title):
    """Fit `model` on the pre-cutoff sample, print its OOS R^2 score, and
    plot predicted vs. realized excess returns over the OOS window."""
    X_train, X_test, y_train, y_test = _train_test_split_for(key)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model.score(X_test, y_test))
    out = pd.DataFrame(index=all_common_data.loc['2011-02-28':, :].index.tolist())
    out['pred'] = y_pred
    out['true'] = y_test
    out.iplot(title=title)


# Same model battery and ordering as before, without the six copy-pasted cells.
_fit_and_plot(DecisionTreeRegressor(), 'ITUB', 'Tree - ITUB4')
_fit_and_plot(RandomForestRegressor(n_estimators=200, random_state=0, bootstrap=True),
              'ITUB', 'Random Forest - ITUB4')
_fit_and_plot(DecisionTreeRegressor(), 'VALE', 'Tree - VALE5')
_fit_and_plot(RandomForestRegressor(n_estimators=200, random_state=0, bootstrap=True),
              'VALE', 'Random Forest - VALE5')
_fit_and_plot(ElasticNetCV(l1_ratio=0.5, cv=5, random_state=0), 'ITUB', 'Elastic Net CV - ITUB4')
_fit_and_plot(ElasticNetCV(l1_ratio=0.5, cv=5, random_state=0), 'VALE', 'Elastic Net CV - VALE5')
# Análise de reports das companhias
# Clusters de manchetes